{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import torch\n",
    "import time\n",
    "import random\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 查看数据集分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>comment_text</th>\n",
       "      <th>toxic</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0000997932d777bf</td>\n",
       "      <td>Explanation\\nWhy the edits made under my usern...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>000103f0d9cfb60f</td>\n",
       "      <td>D'aww! He matches this background colour I'm s...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 id                                       comment_text  toxic\n",
       "0  0000997932d777bf  Explanation\\nWhy the edits made under my usern...      0\n",
       "1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_csv(\"data/train_one_label.csv\").head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>comment_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>00001cee341fdb12</td>\n",
       "      <td>Yo bitch Ja Rule is more succesful then you'll...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0000247867823ef7</td>\n",
       "      <td>== From RfC == \\n\\n The title is fine as it is...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 id                                       comment_text\n",
       "0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...\n",
       "1  0000247867823ef7  == From RfC == \\n\\n The title is fine as it is..."
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_csv(\"data/test.csv\").head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用torchtext构建数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torchtext import data\n",
    "from torchtext.vocab import Vectors\n",
    "from torch.nn import init\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenize = lambda x: x.split()\n",
    "TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=200)\n",
    "LABEL = data.Field(sequential=False, use_vocab=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    # 若不想自定义继承自Dataset的类MyDataset，也可直接使用torchtext.data.Dataset来构建数据集\n",
    "    # 完整示例如下\n",
    "    def get_dataset(csv_data, text_field, label_field, test=False):\n",
    "\n",
    "        fields = [(\"id\", None), # we won't be needing the id, so we pass in None as the field\n",
    "                     (\"comment_text\", text_field), (\"toxic\", label_field)]       \n",
    "        examples = []\n",
    "\n",
    "        if test:\n",
    "            # 如果为测试集，则不加载label\n",
    "            for text in tqdm(csv_data['comment_text']):\n",
    "                examples.append(data.Example.fromlist([None, text, None], fields))\n",
    "        else:\n",
    "            for text, label in tqdm(zip(csv_data['comment_text'], csv_data['toxic'])):\n",
    "                examples.append(data.Example.fromlist([None, text, label], fields))\n",
    "        return examples, fields\n",
    "\n",
    "    train_data = pd.read_csv('data/train_one_label.csv')\n",
    "    valid_data = pd.read_csv('data/valid_one_label.csv')\n",
    "    test_data = pd.read_csv(\"data/test.csv\")\n",
    "    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True)\n",
    "    LABEL = data.Field(sequential=False, use_vocab=False)\n",
    "\n",
    "    # 得到构建Dataset所需的examples和fields\n",
    "    train_examples, train_fields = get_dataset(train_data, TEXT, LABEL)\n",
    "    valid_examples, valid_fields = get_dataset(valid_data, TEXT, LABEL)\n",
    "    test_examples, test_fields = get_dataset(test_data, TEXT, None, test=True)\n",
    "    # 构建Dataset数据集\n",
    "    train = data.Dataset(train_examples, train_fields)\n",
    "    valid = data.Dataset(valid_examples, valid_fields)\n",
    "    test = data.Dataset(test_examples, test_fields)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 自定义Dataset类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_path = 'data/train_one_label.csv'\n",
    "valid_path = \"data/valid_one_label.csv\"\n",
    "test_path = \"data/test.csv\"\n",
    "\n",
    "# 定义Dataset\n",
    "class MyDataset(data.Dataset):\n",
    "    name = 'Grand Dataset'\n",
    "\n",
    "    @staticmethod\n",
    "    def sort_key(ex):\n",
    "        return len(ex.text)\n",
    "\n",
    "    def __init__(self, path, text_field, label_field, test=False, aug=False, **kwargs):\n",
    "        fields = [(\"id\", None), # we won't be needing the id, so we pass in None as the field\n",
    "                 (\"comment_text\", text_field), (\"toxic\", label_field)]\n",
    "        \n",
    "        examples = []\n",
    "        csv_data = pd.read_csv(path)\n",
    "        print('read data from {}'.format(path))\n",
    "\n",
    "        if test:\n",
    "            # 如果为测试集，则不加载label\n",
    "            for text in tqdm(csv_data['comment_text']):\n",
    "                examples.append(data.Example.fromlist([None, text, None], fields))\n",
    "        else:\n",
    "            for text, label in tqdm(zip(csv_data['comment_text'], csv_data['toxic'])):\n",
    "                if aug:\n",
    "                    # do augmentation\n",
    "                    rate = random.random()\n",
    "                    if rate > 0.5:\n",
    "                        text = self.dropout(text)\n",
    "                    else:\n",
    "                        text = self.shuffle(text)\n",
    "                # Example: Defines a single training or test example.Stores each column of the example as an attribute.\n",
    "                examples.append(data.Example.fromlist([None, text, label], fields))\n",
    "        # 之前是一些预处理操作，此处调用super调用父类构造方法，产生标准Dataset\n",
    "        # super(MyDataset, self).__init__(examples, fields, **kwargs)\n",
    "        super(MyDataset, self).__init__(examples, fields)\n",
    "\n",
    "    def shuffle(self, text):\n",
    "        text = np.random.permutation(text.strip().split())\n",
    "        return ' '.join(text)\n",
    "\n",
    "    def dropout(self, text, p=0.5):\n",
    "        # random delete some text\n",
    "        text = text.strip().split()\n",
    "        len_ = len(text)\n",
    "        indexs = np.random.choice(len_, int(len_ * p))\n",
    "        for i in indexs:\n",
    "            text[i] = ''\n",
    "        return ' '.join(text)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 构建数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "25it [00:00, 8599.82it/s]\n",
      "25it [00:00, 9660.73it/s]\n",
      "100%|██████████| 33/33 [00:00<00:00, 28926.23it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "read data from data/train_one_label.csv\n",
      "read data from data/valid_one_label.csv\n",
      "read data from data/test.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "train = MyDataset(train_path, text_field=TEXT, label_field=LABEL, test=False, aug=1)\n",
    "valid = MyDataset(valid_path, text_field=TEXT, label_field=LABEL, test=False, aug=1)\n",
    "# 因为test没有label,需要指定label_field为None\n",
    "test = MyDataset(test_path, text_field=TEXT, label_field=None, test=True, aug=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['comment_text', 'toxic'])\n",
      "dict_keys(['comment_text'])\n"
     ]
    }
   ],
   "source": [
    "print(train[0].__dict__.keys())\n",
    "print(test[0].__dict__.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['i', 'at', 'explanation', 'dolls', 'just', 'talk', 'new', 'since', 'closure', 'reverted?', 'retired', 'fac.', 'after', 'york', 'now.89.205.38.27', 'they', 'gas', 'edits', 'hardcore', 'fan', 'template', 'from', 'username', 'were', \"weren't\", 'please', 'the', 'my', 'page', \"don't\", 'the', 'remove', 'on', 'vandalisms,', 'and', 'made', 'why', 'metallica', 'some', \"i'm\", 'under', 'voted', 'the']\n"
     ]
    }
   ],
   "source": [
    "print(train[0].comment_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 构建词表，最简单的方式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "TEXT.build_vocab(train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  通过预训练的词向量来构建词表的方式示例，以glove.6B.300d词向量为例\n",
    "    cache = 'mycache'\n",
    "    if not os.path.exists(cache):\n",
    "        os.mkdir(cache)\n",
    "    vectors = Vectors(name='/Users/wyw/Documents/vectors/glove/glove.6B.300d.txt', cache=cache)\n",
    "    # 指定 Vector 缺失值的初始化方式，没有命中的token的初始化方式\n",
    "    vectors.unk_init = init.xavier_uniform_ \n",
    "    TEXT.build_vocab(train, min_freq=5, vectors=vectors)\n",
    "    # 查看词表元素\n",
    "    TEXT.vocab.vectors\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('the', 65),\n",
       " ('to', 35),\n",
       " ('you', 29),\n",
       " ('of', 26),\n",
       " ('and', 24),\n",
       " ('is', 19),\n",
       " ('that', 19),\n",
       " ('a', 19),\n",
       " ('i', 18),\n",
       " ('this', 16)]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "TEXT.vocab.freqs.most_common(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 构建数据集迭代器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torchtext.data import Iterator, BucketIterator\n",
    "# 若只针对训练集构造迭代器\n",
    "# train_iter = data.BucketIterator(dataset=train, batch_size=8, shuffle=True, sort_within_batch=False, repeat=False)\n",
    "\n",
    "# 同时对训练集和验证集进行迭代器的构建\n",
    "train_iter, val_iter = BucketIterator.splits(\n",
    "        (train, valid), # 构建数据集所需的数据集\n",
    "        batch_sizes=(8, 8),\n",
    "        device=-1, # 如果使用gpu，此处将-1更换为GPU的编号\n",
    "        sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.\n",
    "        sort_within_batch=False,\n",
    "        repeat=False # we pass repeat=False because we want to wrap this Iterator layer.\n",
    ")\n",
    "\n",
    "test_iter = Iterator(test, batch_size=8, device=-1, sort=False, sort_within_batch=False, repeat=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "[torchtext.data.batch.Batch of size 8 from GRAND DATASET]\n",
      "\t[.comment_text]:[torch.LongTensor of size 200x8]\n",
      "\t[.toxic]:[torch.LongTensor of size 8]\n",
      "torch.Size([200, 8]) torch.Size([8])\n",
      "\n",
      "[torchtext.data.batch.Batch of size 8 from GRAND DATASET]\n",
      "\t[.comment_text]:[torch.LongTensor of size 200x8]\n",
      "\t[.toxic]:[torch.LongTensor of size 8]\n",
      "torch.Size([200, 8]) torch.Size([8])\n",
      "\n",
      "[torchtext.data.batch.Batch of size 1 from GRAND DATASET]\n",
      "\t[.comment_text]:[torch.LongTensor of size 200x1]\n",
      "\t[.toxic]:[torch.LongTensor of size 1]\n",
      "torch.Size([200, 1]) torch.Size([1])\n",
      "\n",
      "[torchtext.data.batch.Batch of size 8 from GRAND DATASET]\n",
      "\t[.comment_text]:[torch.LongTensor of size 200x8]\n",
      "\t[.toxic]:[torch.LongTensor of size 8]\n",
      "torch.Size([200, 8]) torch.Size([8])\n"
     ]
    }
   ],
   "source": [
    "for idx, batch in enumerate(train_iter):\n",
    "    print(batch)\n",
    "    text, label = batch.comment_text, batch.toxic\n",
    "    print(text.shape, label.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 使用torchtext构建的数据集实现LSTM\n",
    "- 因数据集太小，无法收敛，只作为demo熟悉torchtext和pytorch之间的用法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "weight_matrix = TEXT.vocab.vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "class LSTM(nn.Module):\n",
    "\n",
    "    def __init__(self):\n",
    "        super(LSTM, self).__init__()\n",
    "        self.word_embeddings = nn.Embedding(len(TEXT.vocab), 300)  # embedding之后的shape: torch.Size([200, 8, 300])\n",
    "        # 若使用预训练的词向量，需在此处指定预训练的权重\n",
    "        # embedding.weight.data.copy_(weight_matrix)\n",
    "        self.lstm = nn.LSTM(input_size=300, hidden_size=128, num_layers=1)  # torch.Size([200, 8, 128])\n",
    "        self.decoder = nn.Linear(128, 2)\n",
    "\n",
    "    def forward(self, sentence):\n",
    "        embeds = self.word_embeddings(sentence)\n",
    "        lstm_out = self.lstm(embeds)[0]  # lstm_out:200x8x128\n",
    "        # 取最后一个时间步\n",
    "        final = lstm_out[-1]  # 8*128\n",
    "        y = self.decoder(final)  # 8*2 \n",
    "        return y\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = LSTM()\n",
    "model.train()\n",
    "optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)\n",
    "loss_funtion = F.cross_entropy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0.7700, grad_fn=<NllLossBackward>)\n",
      "tensor(0.0174, grad_fn=<NllLossBackward>)\n",
      "tensor(0.7216, grad_fn=<NllLossBackward>)\n",
      "tensor(0.7747, grad_fn=<NllLossBackward>)\n"
     ]
    }
   ],
   "source": [
    "for epoch, batch in enumerate(train_iter):\n",
    "    optimizer.zero_grad()\n",
    "    predicted = model(batch.comment_text)\n",
    "\n",
    "    loss = loss_funtion(predicted, batch.toxic)\n",
    "    loss.backward()\n",
    "    optimizer.step()\n",
    "    print(loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:pytorch]",
   "language": "python",
   "name": "conda-env-pytorch-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
